import os
import tempfile

import numpy as np
import opencc
import whisper
from pydub import AudioSegment

# Load the Whisper speech-recognition model once, before the chunk loop.
model = whisper.load_model("small")

# Load the audio file to transcribe.
audio = AudioSegment.from_mp3("1.mp3")
CHUNK_MS = 3000  # length of each transcription window, in milliseconds (3 s)
total_length = len(audio)  # total duration; sliced below in the same unit as CHUNK_MS

# Converter from Traditional Chinese to Simplified Chinese.
converter = opencc.OpenCC('t2s')

# Stage each chunk in a private temporary directory so the script neither
# overwrites an unrelated "temp.wav" in the working directory nor leaves a
# scratch file behind when it finishes (or fails part-way through).
with tempfile.TemporaryDirectory() as tmp_dir:
    wav_path = os.path.join(tmp_dir, "temp.wav")

    for i in range(0, total_length, CHUNK_MS):
        # The final window is usually shorter than CHUNK_MS; clamp its end so
        # the printed time range never runs past the end of the audio.
        end_ms = min(i + CHUNK_MS, total_length)
        chunk = audio[i:end_ms]

        # export() hands back the file object it wrote to; close it right away
        # so no handle is leaked per chunk and the temporary directory can be
        # removed cleanly on every platform.
        chunk.export(wav_path, format="wav").close()

        result = model.transcribe(wav_path, language="zh", fp16=False)
        text = result["text"]

        # Normalise the transcript to Simplified Chinese.
        simplified_text = converter.convert(text)

        print(f"[{i // 1000}s - {end_ms // 1000}s]: {simplified_text}")
